In [1]:
import os
import pandas as pd
import re
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import plotly.graph_objects as go
import plotly.express as px

# Base names of the data directory and of the files kept inside it.
data_name = "data"
train_bn, test_bn, results_bn = "train.csv", "test.csv", "results.csv"

# The project root is taken to be the parent of the current working
# directory; the data directory sits directly below it.
proj_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
data_dir = os.path.join(proj_dir, data_name)
train_fn, test_fn, results_fn = (
    os.path.join(data_dir, base_name)
    for base_name in (train_bn, test_bn, results_bn)
)

# Stop immediately when the expected directory layout is absent.
if not os.path.exists(data_dir):
    raise OSError("Data directory not properly setup.")
In [2]:
import sklearn.preprocessing as preprocessing

class LabelEncoderExt(preprocessing.LabelEncoder):
    """Label encoder that reserves an extra ``UNK`` class for unseen labels.

    ``fit`` always appends ``UNK`` to the fitted classes, and ``transform``
    replaces every label that is not among the fitted classes with ``UNK``
    before encoding, so unseen labels do not raise.
    """

    # Placeholder label substituted for any value not seen during ``fit``.
    UNK = "UNK"

    def __init__(self):
        super().__init__()

    def fit(self, y):
        """Fit the encoder on the 1-D labels ``y`` plus the ``UNK`` class.

        Returns ``self`` so calls can be chained.
        Raises ``AssertionError`` when ``y`` is not one-dimensional.
        """
        y = np.asarray(y)
        assert y.ndim == 1, "Require 1D array"
        y = np.concatenate((y, np.array([self.UNK])))
        super().fit(y)
        return self

    def transform(self, y):
        """Encode ``y``, mapping labels unknown to the encoder onto ``UNK``.

        The caller's array is left untouched.
        """
        y = np.asarray(y)
        # Labels repeat, so the inputs must not be declared unique to np.isin.
        known = np.isin(y, self.classes_)
        # np.where builds a new array wide enough to hold "UNK" instead of
        # writing into (and possibly truncating inside) the caller's array.
        y = np.where(known, y, self.UNK)
        return super().transform(y)

    def fit_transform(self, y):
        """Fit on ``y`` and return its encoded labels."""
        return self.fit(y).transform(y)
In [3]:
# Load the train and test CSVs. Both frames are required by the cells
# below, so a file that cannot be read aborts here with the offending
# path instead of surfacing later as a NameError.
try:
    df_train = pd.read_csv(train_fn)
except OSError as e:
    raise OSError(f"Training file missing or unreadable: {train_fn}") from e
try:
    df_test = pd.read_csv(test_fn)
except OSError as e:
    raise OSError(f"Test file missing or unreadable: {test_fn}") from e
In [4]:
# Lift pandas' row and column display limits so frames render in full.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)
In [5]:
# --- Columns present in the raw CSV files ---
pid = "PassengerId"
survived = "Survived"
pclass = "Pclass"
name = "Name"
sex = "Sex"
age = "Age"
sib = "SibSp"
par = "Parch"
ticket = "Ticket"
fare = "Fare"
cabin = "Cabin"
emb = "Embarked"

# --- Columns created during preprocessing ---
ticket_count = "ticket_count"
num_cab = "num_cab"
family = "family"
fam_size = "fam_size"
fam_per_ticket = "fam_per_ticket"
single = "single"
child = "child"

# Columns that get one-hot encoded before modelling.
dummy_cols = [pclass, fam_size, name, cabin, emb, num_cab, fare, age]

# Target column(s) and row identifier column(s).
dep_vars = [survived]
indices = [pid]

# Every raw training column except the target, the identifier and the ticket.
ind_vars = [
    col for col in df_train.columns
    if col not in (*dep_vars, *indices, ticket)
]
In [6]:
# Stack the training rows on top of the test rows (with a fresh 0..n-1 index)
# so that one preprocessing pass covers both sets.
df_X = pd.concat([df_train, df_test], ignore_index=True)
In [7]:
# Preview the first 30 rows of the combined, still unprocessed frame.
df_X.head(30)
Out[7]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0.0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1.0 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1.0 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0.0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0.0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0.0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0.0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1.0 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1.0 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
10 11 1.0 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S
11 12 1.0 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S
12 13 0.0 3 Saundercock, Mr. William Henry male 20.0 0 0 A/5. 2151 8.0500 NaN S
13 14 0.0 3 Andersson, Mr. Anders Johan male 39.0 1 5 347082 31.2750 NaN S
14 15 0.0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 350406 7.8542 NaN S
15 16 1.0 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 NaN S
16 17 0.0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 NaN Q
17 18 1.0 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
18 19 0.0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 345763 18.0000 NaN S
19 20 1.0 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C
20 21 0.0 2 Fynney, Mr. Joseph J male 35.0 0 0 239865 26.0000 NaN S
21 22 1.0 2 Beesley, Mr. Lawrence male 34.0 0 0 248698 13.0000 D56 S
22 23 1.0 3 McGowan, Miss. Anna "Annie" female 15.0 0 0 330923 8.0292 NaN Q
23 24 1.0 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.5000 A6 S
24 25 0.0 3 Palsson, Miss. Torborg Danira female 8.0 3 1 349909 21.0750 NaN S
25 26 1.0 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1 5 347077 31.3875 NaN S
26 27 0.0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C
27 28 0.0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.0000 C23 C25 C27 S
28 29 1.0 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q
29 30 0.0 3 Todoroff, Mr. Lalio male NaN 0 0 349216 7.8958 NaN S
In [8]:
# Non-null count per training column; columns below the row total have gaps.
df_train.count()
Out[8]:
PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64
In [9]:
# Non-null count per test column; columns below the row total have gaps.
df_test.count()
Out[9]:
PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64
In [10]:
# Fare distribution of the training rows, coloured by survival, with a rug
# plot in the margin.
fig = px.histogram(
    df_train,
    x=fare,
    color=survived,
    marginal="rug",
    nbins=200,
    hover_data=df_train.columns,
)
# Add a range slider under the x axis for zooming.
fig.update_layout(go.Layout(xaxis={"rangeslider": {"visible": True}}))
fig.show()
In [11]:
# Age distribution of the training rows, coloured by survival, with a rug
# plot in the margin.
fig = px.histogram(
    df_train,
    x=age,
    color=survived,
    marginal="rug",
    nbins=50,
    hover_data=df_train.columns,
)
# Add a range slider under the x axis for zooming.
fig.update_layout(go.Layout(xaxis={"rangeslider": {"visible": True}}))
fig.show()
In [12]:
def pre_encoding(df):
    """Derive the engineered feature columns from the raw passenger columns.

    ``df`` is modified in place and also returned. The raw ``Name``, ``Sex``,
    ``Cabin`` and ``Ticket`` columns are overwritten, so the function must be
    given an unprocessed frame (it is not idempotent).

    Columns written:
      * ``Sex``            -- first character of the original value.
      * ``Name``           -- the title found after the comma (e.g. "Mr.");
                              titles covering less than 1% of the rows
                              become "UNK".
      * ``ticket_count``   -- number of rows sharing the row's ticket.
      * ``num_cab``        -- number of whitespace-separated cabins listed
                              (0 when the cabin is missing).
      * ``Cabin``          -- deck letter after merging neighbouring decks,
                              '' when the cabin is missing.
      * ``family``         -- Parch + SibSp + 1.
      * ``fam_per_ticket`` -- family divided by ticket_count.
      * ``Ticket``         -- 1 when the ticket has more than one
                              whitespace-separated part, else 0.
      * ``single``         -- 1 when family == 1, else 0.
      * ``fam_size``       -- family bucketed into 0..4.

    Raises:
        ValueError: a name has no recognisable title after the comma.
        IndexError: a name contains no comma.
        KeyError: a deck letter or family size is missing from the lookup
            tables below.
    """
    thresh = 0.01  # minimum relative frequency for a title to be kept
    unk = "UNK"
    # Raw string: "\w" and "\s" are invalid escapes in a plain literal.
    reg = re.compile(r"\w+\s?\w*(\.)")
    cabin_mapping = {'': '', 'A': 'A', 'B': 'B', 'C': 'B',
                     'D': 'D', 'E': 'D', 'F': 'F', 'G': 'F', 'T': 'A'}
    f_size = {1: 0, 2: 1, 3: 1, 4: 2, 5: 2,
              6: 3, 7: 3, 8: 4, 9: 4, 10: 4, 11: 4}

    def first_char(text):
        """Return the first character of ``text`` or '' when it is empty."""
        return text[0] if len(text) > 0 else ''

    def extract_title(full_name):
        """Return the title that follows the comma in ``full_name``."""
        remainder = full_name.split(',')[1].strip()
        match = reg.match(remainder)
        if match is None:
            raise ValueError(
                "Cannot extract a title from name: %r" % (full_name,))
        return match.group()

    df[sex] = df[sex].apply(first_char)

    # Reduce each name to its title, then pool the rare titles.
    df[name] = df[name].apply(extract_title)
    freq = df[name].value_counts(normalize=True)
    df[name] = df[name].where(df[name].map(freq) >= thresh, unk)

    # Rows per ticket, counted on the non-null passenger ids of each group.
    df[ticket_count] = df.groupby(ticket)[pid].transform("count")

    # After fillna('') no cabin is null, so no null mask is needed below.
    df[cabin] = df[cabin].fillna('')
    df[num_cab] = df[cabin].apply(lambda cabins: len(cabins.split()))
    df[cabin] = df[cabin].apply(first_char).apply(
        lambda deck: cabin_mapping[deck])

    df[family] = df[par] + df[sib] + 1
    df[fam_per_ticket] = df[family] / df[ticket_count]

    # Must run after ticket_count, which groups on the raw ticket strings.
    df[ticket] = df[ticket].apply(
        lambda raw: 1 if len(raw.split()) > 1 else 0)

    df[single] = (df[family] == 1).astype("int64")

    df[fam_size] = df[family].apply(lambda size: f_size[size])

    return df


def build_encoders(df):
    """Fit the label encoders and standard scalers used by the pipeline.

    Returns a pair ``(encoders, scalers)``: ``encoders`` maps the Name, Sex,
    Embarked and Cabin column names to fitted label encoders (Name uses
    ``LabelEncoderExt``), and ``scalers`` maps the Age and Fare column names
    to fitted ``StandardScaler`` objects. Missing values are dropped before
    fitting everything except the Name encoder.
    """
    title_encoder = LabelEncoderExt()
    title_encoder.fit(df[name])
    encoders = {name: title_encoder}

    for column in (sex, emb, cabin):
        encoder = preprocessing.LabelEncoder()
        encoder.fit(df[column].dropna())
        encoders[column] = encoder

    scalers = {}
    for column in (age, fare):
        scaler = preprocessing.StandardScaler()
        scaler.fit(df[[column]].dropna().values)
        scalers[column] = scaler

    return encoders, scalers


def scale(df, scl):
    """Standardise the non-missing Age and Fare values of ``df`` in place.

    ``scl`` maps each of the two column names to a fitted scaler. Missing
    values are left as they are. Returns ``df``.
    """
    for column in (age, fare):
        present = df[column].notnull()
        scaled = scl[column].transform(df[[column]].dropna().values)
        df.loc[present, column] = scaled
    return df


def naive_bayes_data_fill(df, enc_dict):
    """Label-encode the categorical columns and impute missing Embarked.

    Embarked, Sex, Cabin and Name are replaced in place by the integer codes
    of the fitted encoders in ``enc_dict`` (missing Embarked/Cabin values are
    skipped). Rows that still lack an Embarked code are then filled with the
    prediction of a multinomial naive Bayes model trained on the complete
    rows, using Pclass, Sex and Parch as features. Returns ``df``.
    """
    has_emb = df[emb].notnull()
    df.loc[has_emb, emb] = enc_dict[emb].transform(df.loc[has_emb, emb].values)
    df.loc[:, sex] = enc_dict[sex].transform(df[sex].values)
    has_cabin = df[cabin].notnull()
    df.loc[has_cabin, cabin] = (
            enc_dict[cabin].transform(df.loc[has_cabin, cabin].values))
    df.loc[:, name] = enc_dict[name].transform(df[name].values)

    criteria = [pclass, sex, par]
    complete = df[criteria + [emb]].dropna()
    is_complete = df.index.isin(complete.index)

    to_fill = df.loc[~is_complete, criteria]
    if len(to_fill) > 0:
        # The builtin int replaces the np.int alias removed from NumPy.
        X = complete[criteria].values.astype(int)
        Y = complete[emb].values.astype(int)
        clf = MultinomialNB()
        clf.fit(X, Y)
        # Predict on a plain integer array, the same form used for fitting.
        df.loc[~is_complete, emb] = clf.predict(to_fill.values.astype(int))

    return df


def mean_data_fill(df):
    """Fill missing Fare and Age values with group means, in place.

    Missing fares take the mean fare of the row's (Pclass, Embarked) group;
    missing ages take the mean age of the row's (Name, Pclass, Sex) group.
    When no group mean is available the value is filled with 0.
    Returns ``df``.
    """

    def fill_with_group_mean(column, group):
        """Replace the nulls of ``column`` by the mean of their ``group``."""
        missing = df[column].isnull()
        if not missing.any():
            return
        # Aggregate only the target column; transform aligns the group mean
        # with every row, so no separate index lookup is needed.
        group_mean = df.groupby(group)[column].transform("mean")
        df.loc[missing, column] = group_mean[missing].fillna(0).values

    fill_with_group_mean(fare, [pclass, emb])
    fill_with_group_mean(age, [name, pclass, sex])

    return df


def discretize(df):
    """Bin Age and Fare of ``df`` into integer-labelled intervals, in place.

    Age is cut into 10 bins and Fare into 6 bins with ``pd.cut``; the bins
    are labelled 0..n-1. Returns ``df``.
    """
    age_bins = 10
    fare_bins = 6
    # Bin the frame that was passed in rather than the notebook-level df_X.
    df[age] = pd.cut(df[age], bins=age_bins,
                     labels=np.arange(0, age_bins))
    df[fare] = pd.cut(df[fare], bins=fare_bins,
                      labels=np.arange(0, fare_bins))
    return df


def preprocess(df, enc_dict=None, scl=None):
    """Run the full preprocessing pipeline on ``df``.

    Steps, in order: feature derivation (``pre_encoding``), scaling of Age
    and Fare, label encoding plus Embarked imputation, group-mean filling of
    Fare and Age, and discretisation of Age and Fare.

    Args:
        df: raw passenger frame; it is modified in place.
        enc_dict: fitted encoders keyed by column name, or None to fit them
            on ``df``.
        scl: fitted scalers keyed by column name, or None to fit them on
            ``df``.

    Returns:
        ``(df, enc_dict, scl)`` -- the processed frame and the encoders and
        scalers that were used.
    """
    df = pre_encoding(df)
    # Fit whichever of the two is missing; supplied objects are kept as is.
    if enc_dict is None or scl is None:
        fitted_enc, fitted_scl = build_encoders(df)
        enc_dict = fitted_enc if enc_dict is None else enc_dict
        scl = fitted_scl if scl is None else scl
    df = scale(df, scl)
    df = naive_bayes_data_fill(df, enc_dict)

    df = mean_data_fill(df)
    df = discretize(df)

    return df, enc_dict, scl
In [13]:
# Preprocess the combined frame and keep the fitted encoders and scalers.
# NOTE: preprocess overwrites raw columns of df_X (e.g. Name), so this cell
# cannot be re-run on the processed frame; rebuild df_X first.
df_X, enc_dict, scl = preprocess(df_X)
In [14]:
# Preview the first 30 rows after preprocessing.
df_X.head(30)
Out[14]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked ticket_count num_cab family fam_per_ticket single fam_size
0 1 0.0 3 2 1 2 1 0 1 0 0 2 1 0 2 2.0 0 1
1 2 1.0 1 3 0 4 1 0 1 0 2 0 2 1 2 1.0 0 1
2 3 1.0 3 1 0 3 0 0 1 0 0 2 1 0 1 1.0 1 0
3 4 1.0 1 3 0 4 1 0 0 0 2 2 2 1 2 1.0 0 1
4 5 0.0 3 2 1 4 0 0 0 0 0 2 1 0 1 1.0 1 0
5 6 0.0 3 2 1 3 0 0 0 0 0 1 1 0 1 1.0 1 0
6 7 0.0 1 2 1 6 0 0 0 0 3 2 2 1 1 0.5 1 0
7 8 0.0 3 0 1 0 3 1 0 0 0 2 5 0 5 1.0 0 2
8 9 1.0 3 3 0 3 0 2 0 0 0 2 3 0 3 1.0 0 1
9 10 1.0 2 3 0 1 1 0 0 0 0 0 2 0 2 1.0 0 1
10 11 1.0 3 1 0 0 1 1 1 0 4 2 3 1 3 1.0 0 1
11 12 1.0 1 1 0 7 0 0 0 0 2 2 1 1 1 1.0 1 0
12 13 0.0 3 2 1 2 0 0 1 0 0 2 1 0 1 1.0 1 0
13 14 0.0 3 2 1 4 1 5 0 0 0 2 7 0 7 1.0 0 3
14 15 0.0 3 1 0 1 0 0 0 0 0 2 1 0 1 1.0 1 0
15 16 1.0 2 3 0 6 0 0 0 0 0 2 1 0 1 1.0 1 0
16 17 0.0 3 0 1 0 4 1 0 0 0 1 6 0 6 1.0 0 3
17 18 1.0 2 2 1 4 0 0 0 0 0 2 1 0 1 1.0 1 0
18 19 0.0 3 3 0 3 1 0 0 0 0 2 2 0 2 1.0 0 1
19 20 1.0 3 3 0 4 0 0 0 0 0 0 1 0 1 1.0 1 0
20 21 0.0 2 2 1 4 0 0 0 0 0 2 2 0 1 0.5 1 0
21 22 1.0 2 2 1 4 0 0 0 0 3 2 1 1 1 1.0 1 0
22 23 1.0 3 1 0 1 0 0 0 0 0 1 1 0 1 1.0 1 0
23 24 1.0 1 2 1 3 0 0 0 0 1 2 1 1 1 1.0 1 0
24 25 0.0 3 1 0 0 3 1 0 0 0 2 5 0 5 1.0 0 2
25 26 1.0 3 3 0 4 1 5 0 0 0 2 7 0 7 1.0 0 3
26 27 0.0 3 2 1 3 0 0 0 0 0 0 1 0 1 1.0 1 0
27 28 0.0 1 2 1 2 3 2 0 3 2 2 6 3 6 1.0 0 3
28 29 1.0 3 1 0 2 0 0 0 0 0 1 1 0 1 1.0 1 0
29 30 0.0 3 2 1 3 0 0 0 0 0 2 1 0 1 1.0 1 0
In [15]:
# Discard the ticket flag, the raw sibling/parent counts and the intermediate
# family columns in a single pass.
df_X = df_X.drop(columns=[ticket, par, sib, family, fam_per_ticket])
In [16]:
# Split the processed frame back by position: the leading rows (as many as
# df_train currently holds) are the training set, the remainder the test set.
n_train = df_train.shape[0]
df_train = df_X.iloc[:n_train]
df_test = df_X.iloc[n_train:]
In [17]:
# Features charted against survival below.
feat_list = [
    emb, name, pclass, fam_size, cabin,
    ticket_count, num_cab, fare, age,
]

# One bar chart per feature: bar height is the number of non-null passenger
# ids in each (feature value, Survived) group, coloured by Survived.
for feat in feat_list:
    df_tmp = (
        df_train
        .groupby([feat, survived])
        .count()
        .reset_index()
    )
    fig = px.bar(df_tmp, x=feat, y=pid, color=survived)
    fig.show()
In [18]:
# One-hot encode the categorical columns; the training matrix also loses the
# target and the identifier, the test matrix only the identifier.
X_train = (
    pd.get_dummies(df_train, columns=dummy_cols)
    .drop(columns=dep_vars + indices)
)
X_test = pd.get_dummies(df_test, columns=dummy_cols).drop(columns=indices)

# Give the test matrix exactly the training columns, in the same order:
# dummy columns absent from the test rows are added as zeros and columns
# unknown to the training matrix are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

Y_train = df_train[dep_vars]
In [19]:
SEED = 0
np.random.seed(seed=SEED)
# random_state pins the forest's randomness to SEED directly, so the fit does
# not depend on how much of the global NumPy random stream was consumed
# before this cell ran.
clf = RandomForestClassifier(n_estimators=100, max_depth=4,
                            criterion="gini",
                            min_samples_split=3, min_samples_leaf=5,
                            random_state=SEED)
clf.fit(X_train, Y_train.values.ravel())

# Predict the test rows and pair each prediction with its passenger id.
pred = clf.predict(X_test)
results = pd.DataFrame({indices[0]: df_test[indices[0]].values, dep_vars[0]: pred}).astype({survived: int})
In [20]:
# Write the predictions to the results CSV without the frame's index column.
results.to_csv(results_fn, index=False)
In [21]:
# Accuracy and confusion matrix on the training rows themselves; the
# prediction is computed once and shared by both metrics.
train_pred = clf.predict(X_train)
print("Accuracy:", accuracy_score(Y_train, train_pred))
pd.DataFrame(confusion_matrix(Y_train, train_pred))
Accuracy: 0.835016835016835
Out[21]:
0 1
0 497 52
1 95 247
In [22]:
# Draw the first tree of the fitted forest.
estimator = clf.estimators_[0]

import sklearn.tree as tree
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot()
# Pass the feature names as a plain list (some scikit-learn releases reject a
# pandas Index here) and draw on the axes created above explicitly.
a = tree.plot_tree(estimator, feature_names=list(X_train.columns),
                   filled=True, ax=ax)